Load packages

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
source("functions.R")

I downloaded the data file and read it into R

# Download the five-year gapminder data into data/ and load it as `gapminder`.
# download.file() does not create missing directories, so make sure the
# destination directory exists before downloading.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
download.file(
  "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/_episodes_rmd/data/gapminder-FiveYearData.csv",
  destfile = "data/gapminder-FiveYearData.csv"
)

gapminder <- read.csv("data/gapminder-FiveYearData.csv")
# Preview the first rows to check the columns that were read.
head(gapminder)
##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan 1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan 1962 10267083      Asia  31.997  853.1007
## 4 Afghanistan 1967 11537966      Asia  34.020  836.1971
## 5 Afghanistan 1972 13079460      Asia  36.088  739.9811
## 6 Afghanistan 1977 14880372      Asia  38.438  786.1134

I wonder how life expectancy has changed over the years

# Scatter plot of life expectancy against year for every row of gapminder.
ggplot(gapminder, aes(x = year, y = lifeExp)) +
  geom_point()

interactive version

# Install plotly only when it is missing. requireNamespace() checks whether
# the package is available without attaching it; attaching is left to the
# library(plotly) call further down.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(plotly)

# Life expectancy against GDP per capita (log10 x axis) for the rows whose
# continent is "Americas": one panel per country, with points and a fitted
# linear model.
p <- ggplot(
  data = gapminder[gapminder$continent == "Americas", ],
  # `group` is ggplot2's grouping aesthetic (there is no `by` aesthetic).
  aes(x = gdpPercap, y = lifeExp, color = continent, group = country)
) +
  geom_point() +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  facet_wrap(~country) +
  xlab("Gross Domestic Product") +
  ylab("Life Expectancy")

# Convert the ggplot object into an interactive plotly figure.
ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

Making your own functions

If you are repeating yourself in your code, you may be able to solve that problem by making your own function!

# Small numeric sample used to try out the se() helper.
cars <- c(3, 4, 5, 6, 7, 10)
# se() is not defined in this script; presumably it comes from functions.R
# (sourced near the top) -- TODO confirm.
se(cars)
## [1] 1.013794

Data manipulation with dplyr

You will likely want to get subsections of your dataframe and/or calculate means of a variable for a certain subsection; dplyr is your friend!

Explore select

# Pick the three columns of interest by naming the ones to keep ...
year_country_gdp <- gapminder %>%
  select(year, country, gdpPercap)
# ... or by dropping the others. This second assignment replaces the first
# and leaves the remaining columns in their original order.
year_country_gdp <- gapminder %>%
  select(-pop, -continent, -lifeExp)
names(year_country_gdp)
## [1] "country"   "year"      "gdpPercap"

Explore filter

# Piped form: keep the European rows, then the three columns of interest.
year_country_gdp_euro <- gapminder %>%
  filter(continent == "Europe") %>%
  select(year, country, gdpPercap)

# The same two steps written out through the intermediate `euro` data frame.
euro <- gapminder %>% filter(continent == "Europe")
year_country_gdp_euro <- euro %>% select(year, country, gdpPercap)

Exploring the amazing group_by and summarize functions

# One row per country: the mean of gdpPercap and the result of se() applied
# to the same values.
mean_gdp_percountry <- gapminder %>%
  group_by(country) %>%
  summarise(
    mean_gdp = mean(gdpPercap),
    se_gdp = se(gdpPercap)
  )

# Print the summary table.
mean_gdp_percountry
## # A tibble: 142 × 3
##        country   mean_gdp     se_gdp
##         <fctr>      <dbl>      <dbl>
## 1  Afghanistan   802.6746   31.23550
## 2      Albania  3255.3666  344.20223
## 3      Algeria  4426.0260  378.26190
## 4       Angola  3607.1005  336.56641
## 5    Argentina  8955.5538  537.68144
## 6    Australia 19980.5956 2256.11315
## 7      Austria 20411.9163 2787.23968
## 8      Bahrain 18077.6639 1563.29518
## 9   Bangladesh   817.5588   67.86165
## 10     Belgium 19900.7581 2422.32683
## # ... with 132 more rows
# Summarise lifeExp for every continent/country combination: mean, se() and
# the number of rows in each group.
# summarise() removes only the last grouping level, so the result is still
# grouped by continent.
mean_lifeExp_percontinent <- gapminder %>%
  group_by(continent, country) %>%
  summarise(
    mean_lifeExp = mean(lifeExp),
    se_lifeExp = se(lifeExp),
    length_lifeExp = n()
  )

# Print the summary table.
mean_lifeExp_percontinent
## Source: local data frame [142 x 5]
## Groups: continent [?]
## 
##    continent                  country mean_lifeExp se_lifeExp
##       <fctr>                   <fctr>        <dbl>      <dbl>
## 1     Africa                  Algeria     59.03017  2.9849208
## 2     Africa                   Angola     37.88350  1.1562236
## 3     Africa                    Benin     48.77992  1.7691977
## 4     Africa                 Botswana     54.59750  1.7116922
## 5     Africa             Burkina Faso     44.69400  1.9762099
## 6     Africa                  Burundi     44.81733  0.9165096
## 7     Africa                 Cameroon     48.12850  1.5784640
## 8     Africa Central African Republic     43.86692  1.3627459
## 9     Africa                     Chad     46.77358  1.4110376
## 10    Africa                  Comoros     52.38175  2.3476081
## # ... with 132 more rows, and 1 more variables: length_lifeExp <int>

Combining ggplot and dplyr

# Life expectancy over time for the European rows, drawn as one line per
# country with each country in its own panel.
euro_countries <- gapminder %>%
  filter(continent == "Europe") %>%
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  facet_wrap(~country)

# Display the plot.
euro_countries

# Name the plot explicitly so the saved image does not depend on which plot
# ggplot2 currently records as the last one.
ggsave("euro.png", plot = euro_countries)
## Saving 7 x 5 in image
# write.csv() cannot create a missing directory, so make sure processed/
# exists before writing the per-country summary.
dir.create("processed", showWarnings = FALSE, recursive = TRUE)
write.csv(mean_gdp_percountry, "processed/mean_gdp_percountry.csv")

Data manipulation with tidyr

R likes to have ‘long’ format data, where every row is an observation: a single column holds the observed values and the other columns serve to identify each observation. (Exceptions apply when you have multiple types of observations.) To switch back and forth between ‘wide’ format (how we typically enter data in a spreadsheet) and ‘long’ format, use tidyr.

# Download the 'wide' gapminder data into data/ and load it as
# `gapminder_wide`. download.file() does not create missing directories, so
# make sure the destination directory exists before downloading.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
download.file(
  "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/data/gapminder_wide.csv",
  destfile = "data/gapminder_wide.csv"
)

gapminder_wide <- read.csv("data/gapminder_wide.csv")

# Reshape to long format, choosing the measurement columns by name prefix.
gap_long <- gapminder_wide %>%
  gather(
    obstype_year, obs_values,
    starts_with("pop"), starts_with("lifeExp"), starts_with("gdpPercap")
  )

# The reshape again, this time choosing the columns by position (3 to 38).
# This assignment replaces the result above.
gap_long <- gapminder_wide %>%
  gather(obstype_year, obs_values, 3:38)

Separate the obstype_year column

# Split obstype_year at "_" into obs_type and year, then spread each
# observation type back out into its own column of values.
gap_normal <- gap_long %>%
  separate(obstype_year, into = c("obs_type", "year"), sep = "_") %>%
  spread(obs_type, obs_values)

# Preview the first rows of the reshaped data.
head(gap_normal)
##   continent country year gdpPercap lifeExp      pop
## 1    Africa Algeria 1952  2449.008  43.077  9279525
## 2    Africa Algeria 1957  3013.976  45.685 10270856
## 3    Africa Algeria 1962  2550.817  48.303 11000948
## 4    Africa Algeria 1967  3246.992  51.407 12760499
## 5    Africa Algeria 1972  4182.664  54.518 14760787
## 6    Africa Algeria 1977  4910.417  58.014 17152804
# Sort the rows by country, continent and year before comparing with the
# original data frame.
gap_normal <- gap_normal %>% arrange(country, continent, year)

# NOTE(review): only the rows are reordered here; column order and column
# types are left as produced by the reshape, so all.equal() may report
# differences instead of TRUE.
all.equal(gapminder, gap_normal)
##  [1] "Names: 5 string mismatches"                                                                            
##  [2] "Component 1: Attributes: < Component \"levels\": Lengths (142, 5) differ (string compare on first 5) >"
##  [3] "Component 1: Attributes: < Component \"levels\": 5 string mismatches >"                                
##  [4] "Component 1: 1704 string mismatches"                                                                   
##  [5] "Component 2: Attributes: < target is NULL, current is list >"                                          
##  [6] "Component 2: target is numeric, current is factor"                                                     
##  [7] "Component 3: Modes: numeric, character"                                                                
##  [8] "Component 3: target is numeric, current is character"                                                  
##  [9] "Component 4: 'current' is not a factor"                                                                
## [10] "Component 6: Mean relative difference: 4101.546"